D1 Dog bites man.
D2 Man bites dog.
D3 Dog eats meat.
D4 Man eats food.
[dog, bites, man, eats, meat, food]
# Label-encode the vocabulary of a toy four-sentence corpus.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

S1 = 'dog bites man'
S2 = 'man bites dog'
S3 = 'dog eats meat'
S4 = 'man eats food'

# One token list per sentence, then flatten into a single token stream.
data = [S1.split(), S2.split(), S3.split(), S4.split()]
values = data[0] + data[1] + data[2] + data[3]
# data   -> [['dog', 'bites', 'man'], ['man', 'bites', 'dog'],
#            ['dog', 'eats', 'meat'], ['man', 'eats', 'food']]
# values -> ['dog', 'bites', 'man', 'man', 'bites', 'dog',
#            'dog', 'eats', 'meat', 'man', 'eats', 'food']

#Label Encoding: each distinct token gets an integer id (sorted order).
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
# -> array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3], dtype=int64)
#One-Hot Encoding
array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3], dtype=int64)
array([[1., 0., 1., 0., 0., 0., 1., 0.],
[0., 1., 1., 0., 1., 0., 0., 0.],
[1., 0., 0., 1., 0., 0., 0., 1.],
[0., 1., 0., 1., 0., 1., 0., 0.]])
#Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)
#See the BOW rep for first 2 documents
Our vocabulary: {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
BoW representation for 'dog bites man': [[1 1 0 0 1 0]]
BoW representation for 'man bites dog': [[1 1 0 0 1 0]]
# Note: use transform (not fit_transform) when encoding new documents.
#Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)
#Get the representation using this vocabulary, for a new text
Our vocabulary: {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())
Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
# sklearn uses a different cosine-similarity formula than many other programs.
from sklearn.metrics.pairwise import cosine_similarity
# from bag of words
cosine_similarity(bow_rep, bow_rep)
# from tfidf
array([[1. , 0.5 , 0.16666667, 0.16666667],
[0.5 , 1. , 0.16666667, 0.16666667],
[0.16666667, 0.16666667, 1. , 0.16666667],
[0.16666667, 0.16666667, 0.16666667, 1. ]])
array([[1. , 1. , 0.23864426, 0.23864426],
[1. , 1. , 0.23864426, 0.23864426],
[0.23864426, 0.23864426, 1. , 0.30635374],
[0.23864426, 0.23864426, 0.30635374, 1. ]])
# from tfidf; the .T stands for transpose
cosine_similarity(bow_rep_tfidf.T, bow_rep_tfidf.T)
# same method for bag of words, but we had bag of n-grams
array([[1. , 0.85940018, 0. , 0. , 0.85940018,
0. ],
[0.85940018, 1. , 0.36154621, 0. , 0.73856867,
0.51130356],
[0. , 0.36154621, 1. , 0.70710678, 0.36154621,
0.70710678],
[0. , 0. , 0.70710678, 1. , 0.51130356,
0. ],
[0.85940018, 0.73856867, 0.36154621, 0.51130356, 1. ,
0. ],
[0. , 0.51130356, 0.70710678, 0. , 0. ,
1. ]])
# The subs2vec project provides large pretrained models for many languages.
array([-1.4002024 , -2.9394155 , 2.3440564 , -0.2642897 , -0.03686378,
1.108527 , -1.4920337 , 2.8677418 , -1.5980606 , 0.99373615,
2.0346558 , -2.7802339 , -0.8382226 , 0.59061694, 1.3308638 ,
-1.6109508 , 1.1174004 , -0.20566082, 1.5025214 , 0.2391684 ,
-0.24154003, -0.8432145 , -1.625582 , 0.02176754, -0.90974814,
-0.73214424, 1.3413223 , 0.9575398 , 2.3825417 , 0.47476164,
-0.23127365, 0.43896505, 1.0737495 , 1.286182 , 3.4217503 ,
-0.50157917, 1.340261 , 1.5405822 , 0.5391208 , 0.9699144 ,
-2.4776843 , 2.4763935 , 0.64119244, -2.5301526 , -2.7029643 ,
0.23749161, -0.805605 , 2.182995 , 0.05157952, -0.30619007,
1.1573747 , 0.54398054, 0.6189882 , -0.44525868, -1.1290792 ,
0.10181987, -0.6057856 , -0.9006699 , 0.26447338, 0.7151672 ,
-0.9117656 , -3.622216 , -0.10084378, 1.1551071 , 0.9677814 ,
-1.0734687 , -1.9916444 , 2.244074 , 1.3288797 , -0.4738657 ,
1.5999237 , 1.4611611 , 1.957007 , 0.60457355, 0.65467036,
-2.1115632 , 0.73291314, -0.56158 , -1.9893478 , -2.0005374 ,
2.1024084 , 0.8253916 , 0.5908838 , 1.8763747 , 3.3784752 ,
1.2808089 , 2.669356 , -0.07513534, 2.4739594 , 2.2205591 ,
-0.71732175, 1.1982242 , -1.1272483 , -0.23775227, 0.42325893,
-0.21930571, 1.0846907 , -0.03026258, 1.4392033 , -0.28975612],
dtype=float32)
[('delightful', 0.7371078729629517), ('lovely', 0.6907088160514832), ('charming', 0.6905480027198792), ('gentle', 0.6726126670837402), ('wonderful', 0.6654652953147888), ('seductive', 0.6508783102035522), ('gorgeous', 0.6505144834518433), ('cheerful', 0.6472518444061279), ('quiet', 0.6424300670623779), ('sweetest', 0.6386240720748901)]
array([-0.9400318 , -1.9073623 , 0.51388913, 2.1491065 , -0.60443586,
-0.18803059, -0.51881164, -0.59505093, -1.2845879 , 1.3831282 ,
0.2269165 , -1.3870097 , -0.8075259 , -0.1209118 , -1.3422265 ,
0.62562615, 0.6388021 , 0.72818345, -0.87057877, 0.19891827,
-0.12029336, 1.391928 , -0.6556714 , -0.04711962, -1.5190856 ,
1.4182966 , 0.30824763, 0.77387184, 1.1190034 , 0.30155706,
1.2573441 , 1.5616091 , 1.1392734 , 1.1448203 , 0.65536195,
0.31193823, 1.3654766 , 1.0029259 , -0.0639618 , 1.2218494 ,
-0.39288986, 1.0735016 , 0.3121981 , -3.6816125 , -0.39922816,
0.22872576, -0.8371188 , 0.47557637, 1.6749092 , 0.18024959,
0.04610574, 0.65120095, 0.27887535, -1.3650675 , -2.392382 ,
-0.07002597, 0.02706578, -0.46936798, 1.6155813 , -0.72597516,
0.54998606, -0.59589815, -0.75361025, 1.6968971 , 0.03394611,
1.4248078 , -1.3127394 , 1.3107685 , 1.3489949 , 0.9155154 ,
2.188675 , 0.77741283, 1.0098825 , 0.3534072 , 1.078142 ,
0.62115914, 1.1300869 , -0.8866471 , -0.41853163, 0.15080371,
0.20810232, 0.63887495, -0.1412232 , 1.7337992 , 1.8096211 ,
-0.68198216, 1.3986139 , 0.8411694 , 0.38912648, 2.0979743 ,
0.52640474, 0.5799278 , -1.0738829 , 0.03481235, 0.6757732 ,
0.6255503 , 1.4245161 , 0.15055256, 1.1889644 , 0.37116337],
dtype=float32)
[('stupid', 0.7206795811653137), ('scary', 0.712806761264801), ('gorgeous', 0.6989247798919678), ('sexy', 0.6954982280731201), ('cute', 0.6894108057022095), ('awful', 0.6880098581314087), ('silly', 0.6853577494621277), ('dumb', 0.681719958782196), ('laughing', 0.6798037886619568), ('funny', 0.6780909895896912)]
# word2vec and, correspondingly, fastText have a few important embedding variations.
# Inspect the model by looking for the most similar words for a test word.
print(our_model.wv.most_similar('computer', topn = 5))
# Let us see what the 10-dimensional vector for 'computer' looks like.
[('eps', 0.2914133667945862), ('trees', 0.05541810393333435), ('minors', 0.042647670954465866), ('survey', -0.02176341600716114), ('interface', -0.15233567357063293)]
[ 0.0163195 0.00189972 0.03474648 0.00217841 0.09621626 0.05062076
-0.08919986 -0.0704361 0.00901718 0.06394394]
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
[('choice', 0.8792965412139893), ('kca', 0.8783688545227051), ('voting', 0.8781742453575134), ('peoples', 0.8458272814750671), ('trend', 0.8411796689033508), ('brazil', 0.8409967422485352), ('votes', 0.8386208415031433), ('janoskians', 0.8291750550270081), ('direction', 0.8237687349319458), ('fandom', 0.8189729452133179)]
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import textacy.preprocessing as tprep
from gensim.models import Word2Vec, KeyedVectors
df = pd.read_csv('senate22_raw.csv', encoding = 'latin1')
documents = list(df['text'])
def preprocess(text):
    """Normalize a raw tweet and return its lowercase, stopword-free tokens.

    Steps: normalize quotation marks and unicode, strip accents, replace
    URLs, user handles, and emojis with placeholder tokens, then tokenize
    and keep only alphabetic, non-stopword tokens.
    """
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    text = tprep.replace.urls(text)
    text = tprep.replace.user_handles(text)
    text = tprep.replace.emojis(text)
    stops = set(stopwords.words('english'))
    # Lowercase BEFORE the stopword test: NLTK's stopword list is lowercase,
    # so capitalized forms like "The" would otherwise survive filtering.
    tokens = [token.lower() for token in word_tokenize(text)]
    return [token for token in tokens
            if token.isalpha() and token not in stops]
documents = [preprocess(document) for document in documents]
# Train a fresh 25-dimensional word2vec model on the preprocessed tweets.
base_model = Word2Vec(vector_size=25, min_count=5)
base_model.build_vocab(documents)
total_examples = base_model.corpus_count
# train on your data; returns (trained_word_count, raw_word_count),
# e.g. (9573766, 10418945) on this corpus.
base_model.train(documents, total_examples=total_examples, epochs=base_model.epochs)
[('votes', 0.7640517950057983), ('ballot', 0.763647198677063), ('cast', 0.7401794195175171), ('arkansas', 0.7213475108146667), ('voting', 0.7149344682693481), ('ballots', 0.7145712971687317), ('primaries', 0.6942440271377563), ('registered', 0.6823186874389648), ('november', 0.6582370400428772), ('ticket', 0.6474942564964294)]
# add GloVe's vocabulary & weights, then continue training on our data.
# NOTE(review): glove_vectors is assumed to be a KeyedVectors object loaded
# earlier in the notebook — confirm against the full source.
from gensim.scripts.glove2word2vec import glove2word2vec
glove_vectors.save_word2vec_format('glove_vectors.txt', binary=False)
glove_vecs = KeyedVectors.load_word2vec_format('glove_vectors.txt', binary=False)
# update=True extends the existing vocabulary instead of replacing it.
base_model.build_vocab([list(glove_vecs.index_to_key)], update=True)
# train on your data; returns (trained_word_count, raw_word_count),
# e.g. (9573747, 10418945).
base_model.train(documents, total_examples=total_examples, epochs=base_model.epochs)
[('ballot', 0.7678440809249878), ('votes', 0.7641527652740479), ('cast', 0.7384008765220642), ('ballots', 0.7116597890853882), ('november', 0.710943877696991), ('arkansas', 0.7093214988708496), ('deciding', 0.6925686001777649), ('primaries', 0.6900373101234436), ('voting', 0.6753020286560059), ('registered', 0.6667127013206482)]
# using our trained model
#create flattening function
import numpy as np
def document_vectorizer(corpus, model, num_features):
    """Flatten each tokenized document into one fixed-length vector.

    Each document's vector is the mean of the word vectors of its
    in-vocabulary tokens; documents with no known tokens map to zeros.
    Returns an array of shape (len(corpus), num_features).
    """
    known_words = set(model.wv.index_to_key)

    def _mean_vector(tokens):
        # Sum the vectors of in-vocabulary tokens, then average.
        total = np.zeros((num_features,), dtype="float64")
        count = 0
        for tok in tokens:
            if tok in known_words:
                count += 1
                total += model.wv[tok]
        return total / count if count else total

    return np.array([_mean_vector(doc) for doc in corpus])
# generate averaged word vector features from word2vec model
# NOTE(review): common_texts and our_model are defined elsewhere in the
# notebook (gensim's toy corpus and a model trained on it) — confirm.
avg_wv_train_features = document_vectorizer(corpus = common_texts,
model = our_model,
num_features = 10)
common_texts[0]
['human', 'interface', 'computer']
array([-0.02466195, -0.01504401, 0.01022091, 0.04354595, 0.06814317,
0.02040245, -0.00338359, 0.02177527, -0.01909124, -0.00113427])
# using spacy's pretrained model
import spacy
# Load spaCy's small English pipeline.
# NOTE(review): en_core_web_sm ships no static word vectors; .vector values
# here come from the pipeline's context tensors — confirm this is intended.
nlp = spacy.load("en_core_web_sm")
# Process a sentence using the model
doc = nlp("Canada is a large country")
# vector for 'Canada', the first word in the text
print(doc[0].vector)
# Averaged vector for the entire sentence
[-1.74234 -0.90920454 0.41536316 0.15736246 1.2859436 0.24543142
1.2570572 0.35663185 -0.8244102 -0.0674134 1.4712349 0.5119143
-1.3309681 -0.5264146 -1.0188745 -0.8524463 1.2472408 0.2747297
-0.0436547 -0.4842371 -1.2904495 0.42295414 -0.03794765 -0.22511679
-0.4816206 0.36949652 1.2843533 1.4024066 -0.6087295 0.7147388
-0.14381114 -0.9796721 0.452798 0.7162336 -0.5708136 -0.08537036
-0.63481605 0.9896861 -0.474687 3.4676626 -0.9343261 0.29444414
-0.02503309 1.285727 -1.7670362 0.39907005 -0.03138383 2.235859
1.233593 -0.06988642 -0.48538476 1.0872145 -0.8912538 -1.4635974
-0.76645774 -0.4039675 0.86213416 -0.55711997 0.77631915 -0.13158414
-0.3540035 -0.22625872 0.38927513 -0.54100454 0.40940216 -0.5324899
-0.55475163 -0.6075223 0.3275603 -1.6374564 0.7500537 -0.6747781
1.2150496 -0.35457557 -0.85388327 -0.69132215 -0.6772988 -1.405904
-0.5053379 -0.21676248 -0.219181 0.7379973 -0.24607135 -0.960969
0.54404056 0.05432597 0.36030546 -0.17227349 -0.02405006 0.44561887
-0.39934576 -0.57765657 2.6596062 -0.5772871 -0.41377008 0.89937127]
[-2.44864374e-01 -1.56845257e-01 -5.19747622e-02 5.86494267e-01
8.10811967e-02 -1.65754989e-01 7.57052720e-01 2.63185889e-01
1.40734492e-02 2.51211464e-01 2.43307427e-01 -2.79111534e-01
-3.70179832e-01 5.22314429e-01 -5.23915410e-01 4.84695425e-03
4.30857569e-01 -2.19760254e-01 -3.72532457e-01 1.71566337e-01
-2.67529279e-01 2.24802848e-02 -3.03287357e-01 -1.04288436e-01
1.51315406e-01 -5.31261384e-01 4.36048269e-01 2.97305524e-01
4.72418487e-01 3.90211403e-01 2.69951403e-01 2.36672014e-01
4.59462464e-01 -4.97865111e-01 -1.82451054e-01 -1.67997599e-01
1.93978697e-01 5.16766071e-01 -2.88335413e-01 3.74710053e-01
-1.11499667e-01 3.33659947e-01 5.49611822e-02 2.53970414e-01
-5.02043903e-01 3.85194987e-01 -1.86397389e-01 8.60191345e-01
2.11835742e-01 -1.24764726e-01 -7.09948778e-01 7.70933092e-01
-1.79754198e-01 -6.63751960e-01 -4.01271343e-01 1.83464423e-01
-2.96254933e-01 7.63848484e-01 -3.35624158e-01 -1.81755573e-01
5.62856086e-02 -5.20981967e-01 2.32470542e-01 7.93596357e-02
1.17028512e-01 -1.91331297e-01 2.83491552e-01 3.67665291e-03
3.98660034e-01 -5.55250108e-01 2.48089619e-02 2.39709094e-01
3.95606980e-02 -3.22076678e-01 -6.04971290e-01 -7.20350385e-01
-9.31409597e-02 -8.10149968e-01 -5.13003230e-01 -3.73546213e-01
-2.60708272e-01 1.43927217e-01 -2.70993322e-01 -6.63320005e-01
6.42718398e-04 1.37684375e-01 2.20302776e-01 4.54716794e-02
-2.09064916e-01 3.36762726e-01 -1.24832727e-01 -4.11891073e-01
1.14090490e+00 -2.19411142e-02 -4.46765989e-01 4.73975614e-02]
# fastText builds a word's vector from its character n-grams (e.g. "gre",
# "reg", "ega", ..., "...ous"); those n-gram embeddings are combined to
# create the word's embedding.
# Doc2vec is another implementation that allows the word2vec architecture to
# learn arbitrary lengths of items (sentences, paragraphs, documents).
# convert to numbers for visualization
# Average each tokenized document into a single 20-d vector, then project
# the vectors to 2-D with t-SNE for plotting.
# NOTE(review): tokenized_data, our_model and TSNE (sklearn.manifold) are
# defined/imported elsewhere in the notebook — confirm.
avg_wv_train_features = document_vectorizer(corpus=tokenized_data,
                                            model=our_model,
                                            num_features=20)
# create t-sne
tsne = TSNE(n_components=2,
            verbose=1,
            perplexity=10,
            n_iter=300)  # NOTE(review): n_iter was renamed max_iter in sklearn >= 1.5
tsne_results = tsne.fit_transform(avg_wv_train_features)
# verbose=1 log output follows:
# [t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 17639 samples in 0.000s...
[t-SNE] Computed neighbors for 17639 samples in 1.034s...
[t-SNE] Computed conditional probabilities for sample 1000 / 17639
[t-SNE] Computed conditional probabilities for sample 2000 / 17639
[t-SNE] Computed conditional probabilities for sample 3000 / 17639
[t-SNE] Computed conditional probabilities for sample 4000 / 17639
[t-SNE] Computed conditional probabilities for sample 5000 / 17639
[t-SNE] Computed conditional probabilities for sample 6000 / 17639
[t-SNE] Computed conditional probabilities for sample 7000 / 17639
[t-SNE] Computed conditional probabilities for sample 8000 / 17639
[t-SNE] Computed conditional probabilities for sample 9000 / 17639
[t-SNE] Computed conditional probabilities for sample 10000 / 17639
[t-SNE] Computed conditional probabilities for sample 11000 / 17639
[t-SNE] Computed conditional probabilities for sample 12000 / 17639
[t-SNE] Computed conditional probabilities for sample 13000 / 17639
[t-SNE] Computed conditional probabilities for sample 14000 / 17639
[t-SNE] Computed conditional probabilities for sample 15000 / 17639
[t-SNE] Computed conditional probabilities for sample 16000 / 17639
[t-SNE] Computed conditional probabilities for sample 17000 / 17639
[t-SNE] Computed conditional probabilities for sample 17639 / 17639
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 106.293709
[t-SNE] KL divergence after 300 iterations: 4.532127
TextEvaluator (written by Educational Testing Service), CohMetrix, and more